import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib import figure
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)
# Load the raw movie data; the 'gross' column is called 'revenue' from here on.
df = pd.read_csv('movies.csv').rename(columns={'gross': 'revenue'})
# Print the column dtypes and non-null counts.
df.info()
# Take a closer look at missing data: for every column, print the number of
# missing values and the share of rows they represent (rounded percentage).
row_count = len(df)
for col in df.columns:
    missing = df[col].isnull().sum()
    pct_missing = round((missing / row_count) * 100)
    print(f'{col} - {missing} - {pct_missing}%')
The 'budget' column has too many missing values to fill them in with the mean. We will therefore restrict our dataframe to the movies that have a budget value. The new reference dataframe will be 'df2'.
# Keep only the rows that have a budget. The explicit copy makes df2 an
# independent frame, so the column assignments that follow modify df2 itself
# instead of a slice of df (which triggers SettingWithCopyWarning).
df2 = df.loc[df['budget'].notna()].copy()
# Size of the restricted dataframe
df2.shape
# Missing values left in each column
df2.isna().sum()
# Fill the missing revenue values with the mean revenue.
gross_mean = df2['revenue'].mean()
# Assign the filled column back: fillna(inplace=True) on a selected column is
# chained assignment and is not guaranteed to modify df2.
df2['revenue'] = df2['revenue'].fillna(gross_mean)
# Fix columns of interest: parse the 'year' column as datetime values.
df2 = df2.assign(year=pd.to_datetime(df2["year"], format='%Y'))
# Inspect the resulting column types.
df2.dtypes
# Continue on an independent copy of the dataframe.
df2 = df2.copy()
# Set multiple items at once using a boolean mask of the missing revenues.
mask = df2['revenue'].isna()
# Restrict the assignment to the 'revenue' column: without the column label,
# .loc[mask] overwrites every column of the matching rows with the mean.
df2.loc[mask, 'revenue'] = df2['revenue'].mean()
# Look at the first rows before converting the types
df2.head()
# Fix columns of interest: store revenue and budget as 64-bit integers
df2 = df2.astype({'revenue': 'int64', 'budget': 'int64'})
df2.head(2)
# Order the movies from highest to lowest revenue, then drop duplicate rows
df2 = df2.sort_values(by='revenue', ascending=False).drop_duplicates()
df2.head(5)
# Pairwise scatter plots of the dataframe's columns
pd.plotting.scatter_matrix(df2, alpha=0.2);
# Pearson correlation between the numeric columns only. They are selected
# explicitly because DataFrame.corr() in pandas >= 2.0 no longer drops
# non-numeric columns on its own and raises on string columns such as 'genre'.
correlation_matrix = df2.select_dtypes(include='number').corr(method='pearson')
correlation_matrix
# Annotated heatmap of the correlation matrix
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix for Numeric features')
plt.xlabel('Features')
plt.ylabel('Features')
print("⚠️NOTE: The brighter the box, the higher the correlation.")
It looks like higher movie budgets are associated with both higher revenues and higher votes. Let's dive deeper.
We will explore the following:
- budget vs. gross revenue
- budget vs. votes

We will then use a bubble chart to graphically assess the relationship between the three variables at once, by movie genre.
# Scatter plot of budget against gross revenue
plt.scatter(df2['budget'], df2['revenue'])
plt.xlabel('Budget')
plt.ylabel('Gross Revenue')
plt.title('Budget vs Gross Revenue');
# Same relationship drawn with seaborn, which adds a fitted regression line
sns.regplot(
    data=df2,
    x='budget',
    y='revenue',
    scatter_kws={"color": "red"},
    line_kws={'color': "blue"},
)
# Scatter plot of budget against votes
plt.scatter(df2['budget'], df2['votes'])
plt.xlabel('Budget')
plt.ylabel('Votes')
plt.title('Budget vs Votes');
# Same relationship drawn with seaborn, which adds a fitted regression line
sns.regplot(
    data=df2,
    x='budget',
    y='votes',
    scatter_kws={"color": "red"},
    line_kws={'color': "green"},
)
# Per-genre averages of budget, revenue and votes, stored as 64-bit integers
df2_restricted = df2.loc[:, ["genre", "budget", "revenue", "votes"]]
df2_grouped = df2_restricted.groupby('genre').mean().astype('int64')
df2_grouped
import plotly.express as px

# Bubble chart of the per-genre averages: budget on a logarithmic x axis,
# votes on the y axis, bubble size given by revenue, one colour per genre.
fig = px.scatter(
    df2_grouped,
    x="budget",
    y="votes",
    size="revenue",
    color=df2_grouped.index,
    hover_name=df2_grouped.index,
    log_x=True,
    size_max=60,
)
fig.show()
Well... the more money you pump into your movie, the higher the votes it receives! It may have something to do with the amount of advertising involved, which is something we are not considering in this analysis.
In any case, the correlation between budget and revenue is also quite strong: as the budget increases, the size of the bubbles (which indicates the revenue) also tends to get bigger.
That is especially true for Action movies and Animation movies. No surprise there...they are my favorite genres!
# Pearson correlation between the per-genre averages
df2_grouped.corr(method='pearson')
I really like action movies, and I would like them to make more money than adventure movies do. Let's see if, on average, the revenue for action movies is greater than that for adventure movies.
We will use bootstrapping to draw 10000 samples of 1200 items each from the df2 dataset. We will display the distribution of the difference between the sample means in a histogram.
We want to be 95% sure that we will be correct within a ± margin of error. The formula for determining the sample size for comparing means is:
In our case, we need:
# Sample size for a given confidence level and margin of error:
#   n = (z * sigma / E)^2 = z^2 * sigma^2 / E^2
z_score = 1.96  # two-sided z value for a 95% confidence level
# NOTE(review): this is the spread of the per-genre average revenues, not of
# the individual movies in df2 — confirm that this is the intended sigma.
std = np.std(df2_grouped["revenue"])
err = 0.5  # margin of error, in the same unit as 'revenue'
# The z value has to be squared, like the standard deviation and the error.
sample_size = (z_score**2) * (std**2) / err**2
sample_size
df2
# Bootstrap the difference in mean revenue between Action and Adventure
# movies: 10000 resamples of 1200 rows each, drawn with replacement.
diff = []
for _ in range(10000):
    bootstrap = df2.sample(1200, replace=True)
    revenue_action = round(bootstrap.loc[bootstrap['genre'] == 'Action', 'revenue'].mean())
    revenue_adventure = round(bootstrap.loc[bootstrap['genre'] == 'Adventure', 'revenue'].mean())
    diff.append(revenue_action - revenue_adventure)
# Distribution of the bootstrapped differences
plt.hist(diff);
# 95% interval and mean of the bootstrapped differences. The percentiles are
# taken over `diff`; the revenues of the last resample say nothing about the
# difference between the two genres.
np.percentile(diff, 2.5), np.percentile(diff, 97.5), np.mean(diff)
We can be 95% confident that there is a difference in the average revenue of action movies and adventure movies, provided that the 95% interval of the bootstrapped differences does not contain zero.
Let's convert all non-numeric data to numeric data and store the result in a new dataframe, 'df_numerised'.
# Encode on a copy: a plain `df_numerised = df2` is only a second name for the
# same dataframe, so the encoding below would overwrite df2's text columns.
df_numerised = df2.copy()
for col in df_numerised.columns:
    if df_numerised[col].dtype == 'object':
        # Replace every distinct value with its integer category code
        df_numerised[col] = df_numerised[col].astype('category').cat.codes
# Pearson correlations over the numerised dataframe, drawn as an annotated heatmap
correlation_matrix_2 = df_numerised.corr(method='pearson')
correlation_matrix_2
sns.heatmap(data=correlation_matrix_2, annot=True)
plt.xlabel('Features')
plt.ylabel('Features')
plt.title('Correlation Matrix for Non-Numeric features')
print("⚠️NOTE: The brighter the box, the higher the correlation.")
# Flatten the correlation matrix into one value per pair of features
correlation_matrix_2 = df_numerised.corr(method='pearson')
corr_pairs = correlation_matrix_2.unstack()
corr_pairs
# Sort the pairs in ascending order and keep those with a correlation above 0.5
sorted_pairs = corr_pairs.sort_values()
high_corr = sorted_pairs.loc[sorted_pairs.gt(0.5)]
high_corr